library(tidyverse)
library(stargazer)
library(tidytext)
library(matrixStats)
library(haven)
library(urltools)


## ggplot settings
theme_set(theme_classic())

just_domain <- function(x) suffix_extract(url_parse(x)$domain)$domain


alexa355 <- read.csv("../data/politicaldomains-final-edits.csv") %>%
  select(domain, type)
alexa355$domain <- just_domain(alexa355$domain)
alexa355 <- alexa355 %>% mutate(
  type = as.character(type),
  type = ifelse(type == "aggregator", "portal", type))




### Guess (2019) Appendix D1 regressions

news <- read_csv("../results/merged_news.csv")

gdf <- news %>% select(caseid, b_align, weight,
  age, race, female, income, ideology, newsint,
  educ, pid)

mean_align <- gdf %>% select(caseid, b_align) %>%
  group_by(caseid) %>% 
  summarize(mean_align = mean(b_align, na.rm = TRUE))

gdf <- mean_align %>%
  left_join(gdf %>% select(-b_align) %>%distinct(),
    by = "caseid") %>% 
  distinct()

age_levels <- paste0("Age: ", c("Under 30", "30-44", "45-59", "60+"))
race_levels <- paste0("Race: ", c("Other", "White", "Black", "Hispanic"))
educ_levels <- paste0("Educ.: ", c("Some or no HS", "High school", "Some college", "College graduate", "Postgraduate"))
party_levels <- paste0("Party: ", c("Independent", "Democrat",
  "Republican"))
ideo_levels <- paste0("Ideo.: ", c("Moderate", "Liberal", "Conservative", "Very conservative", "Very liberal"))

inc_levels <- c("$10,000 - $19,999",
  "$20,000 - $29,999", "$50,000 - $59,999", 
  "$70,000 - $79,999",   "$30,000 - $39,999",
  "$120,000 - $149,999",
  "$80,000 - $99,999",   "$150,000 - $199,999", "$40,000 - $49,999",
  "$100,000 - $119,999", "$60,000 - $69,999",   "$150,000 or more",
  "Less than $10,000", "$350,000 - $499,999", "$200,000 - $249,999",
  "$250,000 - $349,999")

inc_levels <- inc_levels[c(13, 1, 2, 5, 9, 3, 11, 4,
  7, 10, 6, 8, 15, 16)]

ideo_levels <- c("Conservative", "Liberal", "Moderate",
  "Very conservative","Very liberal")
ideo_levels <- ideo_levels[c(5, 2, 3, 1, 4)]


gdf <- gdf %>% mutate(
  Age = factor(case_when(
    age < 30 ~ age_levels[1],
    age >= 30 & age <= 45 ~ age_levels[2],
    age >= 45 & age <= 59 ~ age_levels[3],
    age >= 60 ~ age_levels[4]), levels = age_levels),
  Race = factor(case_when(
    race == "White" ~ race_levels[2],
    race == "Black" ~ race_levels[3],
    race == "Hispanic" ~ race_levels[4],
    TRUE ~ race_levels[1]), levels = race_levels),
  Educ = factor(case_when(
    educ == "No HS" ~ educ_levels[1],
    educ == "High school graduate" ~ educ_levels[2],
    educ %in% c("Some college", "2-year") ~ educ_levels[3],
    educ == "4-year" ~ educ_levels[4],
    educ == "Post-grad" ~ educ_levels[5]), levels = educ_levels),
  Party = factor(case_when(
    pid <= 3 ~ party_levels[2],
    pid == 4 ~ party_levels[1],
    pid >= 5 ~ party_levels[3]), levels = party_levels),
  Income = factor(income, levels = inc_levels),
  Income_Numeric = as.integer(Income),
  Ideology = factor(ideology, levels = ideo_levels),
  Ideology_Numeric = as.integer(Ideology),
  Female = 1 * (female == "Female"),
)

greg <- lm(mean_align ~ Age + Race + Female + Income_Numeric +
  Educ + Party + Party,
  data = gdf, weights = gdf$weight)

greg2 <- lm(mean_align ~ Age + Race + Female + Income_Numeric +
  Educ + Ideology_Numeric + Party * newsint,
  data = gdf, weights = gdf$weight)


library(sandwich)
library(lmtest)

gtab <- coeftest(greg, vcov. = vcovHC)
gtab2 <- coeftest(greg2, vcov. = vcovHC)

stargazer(greg, greg2,
  type = "latex", style = "ajps",
  out = "../results/tables/slant_on_covariates.tex",
  digits = 2,
  dep.var.labels = "Individual-level Mean Article Alignment",
  covariate.labels = c(age_levels[-1], race_levels[-1],
    "Female", "Income level", educ_levels[-1],
    "7-pt Ideology",
    party_levels[-1], "Political Interest (0/1)",
    "Democrat*Political Interest", "Republican*Political Interest"),
  se = list(gtab[, 2], gtab2[, 2]),
  keep.stat = "n",
  notes = "Standard errors robust to heteroscedasticity (HC1).",
  add.lines = list(c("Std. Dev.", rep(round(sd(gdf$mean_align, na.rm = TRUE), 2), 2))),
  label = "t:slant_on_covariates")

###


### START overlapping coefficient

library(bayestestR)

pdf <- news %>% filter(!is.na(b_align),
  party %in% c("dem", "rep")) %>%
  mutate(party = ifelse(party == "dem", "DEM", "REP")) %>%  
  group_by(caseid, party) %>%
  summarize(mean_align = mean(b_align)) %>% 
  ungroup() 

rep <- pdf %>% filter(party == "REP") %>% pull(mean_align)
dem <- pdf %>% filter(party == "DEM") %>% pull(mean_align)
overlap(rep, dem)

p <- pdf %>% 
  ggplot(aes(mean_align, fill = party)) +
  geom_density(alpha = 0.5) +
  scale_fill_grey() +
  theme_classic() +
  ggtitle("Distribution of Avg. Alignment By Party") +
  labs(x = "Avg. Bakshy et al. Alignment",
    y = "Density", fill = "Party") +
  lims(x = c(-1, 1))
ggsave("../results/figures/avg_align_dist.pdf", device = "pdf", width = 6, height = 4)


# now without portals
pdf <- news %>% filter(portal == 0,
  !is.na(b_align),
  party %in% c("dem", "rep")) %>%
  mutate(party = ifelse(party == "dem", "DEM", "REP")) %>%  
  group_by(caseid, party) %>%
  summarize(mean_align = mean(b_align)) %>% 
  ungroup() 

rep <- pdf %>% filter(party == "REP") %>% pull(mean_align)
dem <- pdf %>% filter(party == "DEM") %>% pull(mean_align)
overlap(rep, dem)

p <- pdf %>% 
  ggplot(aes(mean_align, fill = party)) +
  geom_density(alpha = 0.5) +
  scale_fill_grey() +
  theme_classic() +
  ggtitle("Distribution of Non-Portal Avg. Alignment By Party") +
  labs(x = "Avg. Bakshy et al. Alignment",
    y = "Density", fill = "Party") +
  lims(x = c(-1, 1))
ggsave("../results/figures/avg_align_dist_nonportal.pdf", device = "pdf", width = 6, height = 4)


### END overlapping coefficient




news <- news %>% 
  filter(party %in% c("dem", "rep")) %>% 
  rename(gender = female)


### BEGIN Investigate Partisan Strength

news <- news %>% mutate(strength = case_when(
  pid %in% c(1, 7) ~ "strong",
  pid %in% c(2, 6) ~ "weak",
  pid %in% c(3, 5) ~ "leaning",
  pid %in% c(4) ~ "true_ind"))

subs <- news %>% filter(!is.na(bakshy_domain)) %>% 
  select(caseid, party, strength) %>%
  group_by(caseid, party, strength) %>% 
  summarise(total_visits = n()) %>% ungroup()

ldf <- news %>% group_by(bakshy_domain, caseid) %>% 
  summarise(visits = n()) %>% ungroup() %>% 
  complete(bakshy_domain, caseid, fill = list(visits = 0)) %>% 
  left_join(subs, by = "caseid") %>% 
  filter(!is.na(total_visits))

ldf <- news %>% select(bakshy_domain, b_align) %>% distinct() %>% 
  right_join(ldf, by = "bakshy_domain")

ldf <- ldf %>% mutate(
  rep = 1 * (party == "rep"),
  dem = 1 * (party == "dem"),
  strong = 1 * (strength == "strong"),
  weak = 1 * (strength == "weak"),
  leaner = 1 * (strength == "leaning"),
  pct_diet = 100.0 * visits / total_visits,
  visit_once = 1 * (visits > 0))

ldf <- ldf %>% filter(!is.na(bakshy_domain))

ldf <- ldf %>% left_join(news %>% select(caseid, weight) %>% distinct(), by = "caseid")


library(lfe)
library(lmtest)
library(sandwich)

mod5 <- felm(
  visit_once ~ b_align:rep | caseid + bakshy_domain | 0 | caseid,
  data = ldf,
  weights = ldf$weight)

mod6 <- felm(
  visit_once ~ b_align:rep + b_align:rep:leaner +
  b_align:rep:weak| caseid + bakshy_domain | 0 | caseid,
  data = ldf,
  weights = ldf$weight)

num_ind <- ldf %>% filter(!is.na(b_align)) %>%  pull(caseid) %>% n_distinct()
num_src <- ldf %>% filter(!is.na(b_align)) %>% pull(bakshy_domain) %>% n_distinct()


dv_mean <- sum(ldf$visit_once * ldf$weight, na.rm = TRUE) / sum(ldf$weight)

stargazer(mod5, mod6,
  type = "latex", style = "ajps",
  out = "../results/tables/party_align_twoway_fe.tex",
  label = "t:party_align_twoway_fe",
  title = "Two-way fixed effects: party identification and source alignment.",
  notes = "Standard errors clustered by individual. Regressions computed using \\texttt{lfe} package \\citep{lfe}.",
  digits = 3,
  dep.var.labels = c(
    rep("Visit Domain", 1)),
  covariate.labels = c(
    "REP * Alignment",
    "REP * Alignment * Leaner",
    "REP * Alignment * Weak"),
  keep.stat = "n",
  add.lines = list(
    c("Weighted Mean", rep(round(dv_mean, 3), 2)),
    c("Individuals", rep(num_ind, 2)),
    c("Sources", rep(num_src, 2)),
    c("Individual FEs", rep("\\checkmark", 2)),
    c("Source FEs", rep("\\checkmark", 2)),
    c("Weighted", rep("\\checkmark", 2))))

ldf <- na.omit(ldf)
rmod <- felm(
  visit_once ~ 1 | caseid + bakshy_domain, data = ldf)
ldf$r <- resid(rmod)

newx <- tibble(b_align = seq(-1, 1, by = 0.01))
newx$rep <- 1
newx$dem <- 0
newx <- newx %>% mutate(rep = 0, dem = 1) %>% bind_rows(newx)
newx <- newx %>%
  mutate(Party = case_when(dem == 1 ~ "DEM", rep == 1 ~ "REP"))

library(estimatr) # Prefer how they return confints
y_mod <- ldf %>%
  lm_robust(visit_once ~ b_align:rep + b_align:dem, data = .,
  clusters = caseid)
y_pred <- predict(y_mod, newx, interval = "confidence")$fit
y_df <- cbind(newx, y_pred) %>% as_tibble()

ry_mod <- ldf %>%
  lm_robust(r ~ b_align:rep + b_align:dem,
    data = .,  clusters = caseid)
ry_pred <- predict(ry_mod, newx, interval = "confidence")$fit
ry_df <- cbind(newx, ry_pred) %>% as_tibble()


ldf <- ldf %>%
  mutate(party = case_when(
    party == "dem" ~ "DEM",
    party == "rep" ~ "REP")) %>% 
  rename(Party = party)

p <- ldf %>% ggplot(aes(b_align, r,
    color = Party, fill = Party)) +
  geom_line(data = ry_df, mapping = aes(b_align, fit,
    color = Party)) +
  geom_ribbon(data = ry_df, mapping = aes(b_align, fit,
    ymin = lwr, ymax = upr, fill = Party),
    alpha = 0.5, color = NA) +
  theme_classic() +
  stat_summary_bin(alpha = 0.8,
    fun.y = "mean", bins = 21, geom = "point") +
  scale_fill_grey() +
  scale_colour_grey() +
  labs(x = "Domain's Bakshy et al. (2015) Alignment Score",
    y = "Adjusted Probability of Visiting") +
  ggtitle("Adjusted Probability of Visiting and Source Alignment")
ggsave("../results/figures/visit_vs_align_residualized.pdf", device = "pdf",
  width = 6, height = 4)

p <- ldf %>% ggplot(aes(b_align, visit_once,
    color = Party, fill = Party)) +
  geom_line(data = y_df, mapping = aes(b_align, fit,
    color = Party)) +
  geom_ribbon(data = y_df, mapping = aes(b_align, fit,
    ymin = lwr, ymax = upr, fill = Party),
    alpha = 0.5, color = NA) +
  theme_classic() +
  stat_summary_bin(alpha = 0.8,
    fun.y = "mean", bins = 21, geom = "point") +
  scale_fill_grey() +
  scale_colour_grey() +
  labs(x = "Domain's Bakshy et al. (2015) Alignment Score",
    y = "Probability of Visiting") +
  ggtitle("Probability of Visiting and Source Alignment")
ggsave("../results/figures/visit_vs_align.pdf", device = "pdf",
  width = 6, height = 4)




### END






domain_list <- news %>% pull(domain) %>% unique() %>% sort()
domain_list <- c(domain_list, "")
stargazer(matrix(domain_list, ncol = 5),
  style = "ajps", type = "latex",
  out = "../results/tables/domain_list.tex",
  table.placement = "H",
  title = "List of Alexa News Domains")



# Summary Statistics: Median, Average Articles/Day
# Subject to attrition though
num_dates <- news %>% pull(date) %>% n_distinct()

median_articles <- news %>% group_by(party, caseid) %>%
  summarise(n = n(), day_avg = n/num_dates) %>% 
  ungroup() %>%
  group_by(party) %>% 
  summarise(median_day_avg = median(day_avg),
    mean_day_avg = mean(day_avg)) %>% 
  rename(Party = party, `Median Articles/Day` = median_day_avg,
    `Mean Articles/Day` = mean_day_avg) %>% 
  mutate(Party = recode(Party,
    dem = "Democrats", rep = "Republicans", ind = "Independents")) %>% 
  data.frame()
colnames(median_articles)[2] <- "Median URLs/Day"
colnames(median_articles)[3] <- "Mean URLs/Day"
stargazer(median_articles,
  digits = 2,
  summary = FALSE, rownames = FALSE,
  style = "ajps", type = "latex",
  title = "Political URLs/Day by Party",
  out = "../results/tables/summary_articles_day.tex",
  table.placement = "H")




### Daily Source Shares (Bakshy variant)

bnews <- news %>% filter(!is.na(bakshy_domain))

subject_weights <- bnews %>% select(caseid, weight) %>% distinct()

user_visits <- bnews %>% group_by(caseid, date, party, bakshy_domain) %>%
  summarise(num_visits = n()) %>% 
  ungroup()

user_visits <- user_visits %>%
  complete(nesting(caseid, party), bakshy_domain, date,
    fill = list(num_visits = 0))

total_visits <- user_visits %>% group_by(caseid, party, date) %>% 
  summarise(total = sum(num_visits),
    no_bnews = 1 * (total == 0)) %>% ungroup()

# total_visits %>%
#   group_by(caseid, party) %>%
#   summarise(m = mean(no_bnews)) %>%
#   ungroup() %>%
#   group_by(party) %>%
#   summarise(mean_nn = mean(m), median_nn = median(m))
#   party mean_nn median_nn
#   <chr>   <dbl>     <dbl>
# 1 dem     0.625     0.727
# 2 rep     0.685     0.768

user_visits <- user_visits %>% left_join(total_visits %>% select(-no_bnews),
  by = c("caseid", "party", "date"))
party_shares <- user_visits %>%
  mutate(share = ifelse(total == 0, 0, num_visits / total))

empty_visits <- total_visits %>% 
  mutate(bakshy_domain = "no bnews", share = ifelse(no_bnews == 1, 1, 0)) %>% 
  select(-total, -no_bnews)

party_shares <- party_shares %>% bind_rows(empty_visits) %>% 
  left_join(subject_weights, by = "caseid") %>% 
  group_by(party, bakshy_domain, date) %>% 
  summarise(share = weighted.mean(share, weight))

party_shares <- party_shares %>% ungroup() %>% 
  group_by(party, bakshy_domain) %>% 
  summarise(share = mean(share)) %>% 
  arrange(-share) %>% 
  ungroup()

dem_shares <- party_shares %>% filter(party == "dem") %>%
  select(-party) %>%
  mutate(cumulative = cumsum(share)) %>% 
  mutate_at(-1, ~ round(. * 100, 3)) %>%
  head(n = 30) %>%
  as.data.frame()
colnames(dem_shares) <- c("Domain", "Average Share (%)",
  "Cumulative Share (%)")
stargazer(dem_shares,
  digits = 2,
  summary = FALSE, rownames = FALSE,
  style = "ajps", type = "latex",
  title = "Democratic Domain Shares (Weighted)",
  out = "../results/tables/source_share_dem_nobnews.tex",
  table.placement = "H")

rep_shares <- party_shares %>% filter(party == "rep") %>%
  select(-party) %>%
  mutate(cumulative = cumsum(share)) %>% 
  mutate_at(-1, ~ round(. * 100, 3)) %>%
  head(n = 30) %>%
  as.data.frame()
colnames(rep_shares) <- c("Domain", "Average Share (%)",
  "Cumulative Share (%)")
stargazer(rep_shares,
  digits = 2,
  summary = FALSE, rownames = FALSE,
  style = "ajps", type = "latex",
  title = "Republican Domain Shares (Weighted)",
  out = "../results/tables/source_share_rep_nobnews.tex",
  table.placement = "H")


## Source Shares (Bakshy variant)

subject_weights <- bnews %>% select(caseid, weight) %>% distinct()

user_visits <- bnews %>% group_by(caseid, party, bakshy_domain) %>%
  summarise(num_visits = n()) %>% 
  ungroup()

user_visits <- user_visits %>% group_by(caseid, party) %>%
  summarise(total = sum(num_visits)) %>% 
  left_join(user_visits, ., by = c("caseid", "party")) %>% 
  mutate(share = num_visits / total)

user_visits <- user_visits %>% select(-total, -num_visits) %>% 
  complete(nesting(caseid, party), bakshy_domain, fill = list(share = 0))

party_shares <- user_visits %>%
  left_join(subject_weights, by = "caseid") %>% 
  group_by(bakshy_domain, party) %>% 
  do(mod = lm(share ~ 1, weights = weight, data = .)) %>% 
  mutate(share = summary(mod)$coefficients[1],
    share_se = summary(mod)$coefficients[2]) %>% 
  select(-mod) %>% 
  arrange(-share) %>% 
  ungroup()


dem_shares <- party_shares %>% filter(party == "dem") %>%
  select(-party) %>%
  mutate(cumulative = cumsum(share)) %>% 
  mutate_at(-1, ~ round(. * 100, 3)) %>%
  head(n = 30) %>%
  as.data.frame()
colnames(dem_shares) <- c("Domain", "Average Share (%)",
  "Std. Error (%)", "Cumulative Share (%)")
stargazer(dem_shares,
  digits = 2,
  summary = FALSE, rownames = FALSE,
  style = "ajps", type = "latex",
  title = "Democratic Domain Shares (Weighted)",
  out = "../results/tables/source_share_dem.tex",
  table.placement = "H")

rep_shares <- party_shares %>% filter(party == "rep") %>%
  select(-party) %>%
  mutate(cumulative = cumsum(share)) %>% 
  mutate_at(-1, ~ round(. * 100, 3)) %>%
  head(n = 30) %>%
  as.data.frame()
colnames(rep_shares) <- c("Domain", "Average Share (%)",
  "Std. Error (%)", "Cumulative Share (%)")
stargazer(rep_shares,
  digits = 2,
  summary = FALSE, rownames = FALSE,
  style = "ajps", type = "latex",
  title = "Republican Domain Shares (Weighted)",
  out = "../results/tables/source_share_rep.tex",
  table.placement = "H")



plot_df <- bnews %>% select(bakshy_domain, b_align) %>%
  distinct() %>% 
  right_join(party_shares, by = "bakshy_domain")
plot_df <- plot_df %>% select(-share_se) %>% spread(party, share)

p <- plot_df %>% arrange(-dem) %>%
  head(n = 30) %>% 
  mutate(bakshy_domain = factor(bakshy_domain,
    levels = rev(bakshy_domain))) %>%
  ggplot(aes(bakshy_domain, b_align, size = dem)) +
  geom_point() +
  coord_flip() +
  geom_hline(yintercept = 0, lty = 3) +
  lims(y = c(-1, 1)) +
  labs(x = "Source", y = "Bakshy et al. Alignment Scores") +
  ggtitle("Top 30 Democratic Political News Sources (Weighted)") +
  guides(size = FALSE) +
  theme(legend.title = element_blank(), legend.position = "bottom")
ggsave("../results/figures/market_share_dem_bnews.pdf", device = "pdf", height = 6, width = 6)


p <- plot_df %>% arrange(-rep) %>%
  head(n = 30) %>% 
  mutate(bakshy_domain = factor(bakshy_domain,
    levels = rev(bakshy_domain))) %>%
  ggplot(aes(bakshy_domain, b_align, size = rep)) +
  geom_point() +
  coord_flip() +
  geom_hline(yintercept = 0, lty = 3) +
  lims(y = c(-1, 1)) +
  labs(x = "Source", y = "Bakshy et al. Alignment Scores") +
  ggtitle("Top 30 Republican Political News Sources (Weighted)") +
  guides(size = FALSE) +
  theme(legend.title = element_blank(), legend.position = "bottom")
ggsave("../results/figures/market_share_rep_bnews.pdf", device = "pdf", height = 6, width = 6)








## Daily Source Shares (REP Fav.)

subject_weights <- news %>% select(caseid, weight) %>% distinct()

user_visits <- news %>% group_by(caseid, date, party, domain) %>%
  summarise(num_visits = n()) %>% 
  ungroup()

user_visits <- user_visits %>%
  complete(nesting(caseid, party), domain, date,
    fill = list(num_visits = 0))

total_visits <- user_visits %>% group_by(caseid, party, date) %>% 
  summarise(total = sum(num_visits),
    no_news = 1 * (total == 0))

# total_visits %>%
#   group_by(caseid, party) %>%
#   summarise(m = mean(no_news)) %>%
#   ungroup() %>%
#   group_by(party) %>%
#   summarise(mean_nn = mean(m), median_nn = median(m))
#   party mean_nn median_nn
#   <chr>   <dbl>     <dbl>
# 1 dem     0.625     0.727
# 2 rep     0.685     0.768

user_visits <- user_visits %>% left_join(total_visits %>% select(-no_news),
  by = c("caseid", "party", "date"))
party_shares <- user_visits %>%
  mutate(share = ifelse(total == 0, 0, num_visits / total))

empty_visits <- total_visits %>% 
  mutate(domain = "no news", share = ifelse(no_news == 1, 1, 0)) %>% 
  select(-total, -no_news)

party_shares <- party_shares %>% bind_rows(empty_visits) %>% 
  left_join(subject_weights, by = "caseid") %>% 
  group_by(party, domain, date) %>% 
  summarise(share = weighted.mean(share, weight))

party_shares <- party_shares %>% ungroup() %>% 
  group_by(party, domain) %>% 
  summarise(share = mean(share)) %>% 
  arrange(-share) %>% 
  ungroup()

dem_shares <- party_shares %>% filter(party == "dem") %>%
  select(-party) %>%
  mutate(cumulative = cumsum(share)) %>% 
  mutate_at(-1, ~ round(. * 100, 3)) %>%
  head(n = 30) %>%
  as.data.frame()
colnames(dem_shares) <- c("Domain", "Average Share (%)",
  "Cumulative Share (%)")
stargazer(dem_shares,
  digits = 2,
  summary = FALSE, rownames = FALSE,
  style = "ajps", type = "latex",
  title = "Democratic Domain Shares (Weighted)",
  out = "../results/tables/source_share_dem_nonews.tex",
  table.placement = "H")

rep_shares <- party_shares %>% filter(party == "rep") %>%
  select(-party) %>%
  mutate(cumulative = cumsum(share)) %>% 
  mutate_at(-1, ~ round(. * 100, 3)) %>%
  head(n = 30) %>%
  as.data.frame()
colnames(rep_shares) <- c("Domain", "Average Share (%)",
  "Cumulative Share (%)")
stargazer(rep_shares,
  digits = 2,
  summary = FALSE, rownames = FALSE,
  style = "ajps", type = "latex",
  title = "Republican Domain Shares (Weighted)",
  out = "../results/tables/source_share_rep_nonews.tex",
  table.placement = "H")


party_nn <- party_shares %>% filter(domain == "no news") %>% 
  select(-domain) %>% 
  mutate(party = ifelse(party == "rep", "Republican", "Democrat")) %>% 
  mutate(share = 100 * share) %>% 
  as.data.frame()
colnames(party_nn) <- c("Party", "Percent of Days without Political News URL (%)")
stargazer(party_nn,
  digits = 1,
  summary = FALSE, rownames = FALSE,
  style = "ajps", type = "latex",
  title = "Frequency of Political News URL Consumption (Weighted)",
  out = "../results/tables/percent_nonews.tex",
  table.placement = "H")

# Table with NN and other domains
# Table with percent of days with NN by party
# Table with heterogeneity in party




### EXPERIMENT BEGIN

subject_weights <- news %>% select(caseid, weight) %>% distinct()

user_visits <- news %>% group_by(caseid, party, domain) %>%
  summarise(num_visits = n()) %>% 
  ungroup()

user_visits <- user_visits %>% 
  complete(nesting(caseid, party), domain,
    fill = list(num_visits = 0))

party_shares <- user_visits %>%
  left_join(subject_weights, by = "caseid") %>% 
  group_by(domain, party) %>%
  summarize(prob_nonzero = sum((num_visits > 0) * weight) / sum(weight),
    median_nonzero_visits = median(num_visits[num_visits > 0])) %>% 
  ungroup() %>% 
  mutate(m = prob_nonzero * median_nonzero_visits)

party_shares <- party_shares %>% group_by(domain) %>% 
  summarize(both = sum(m)) %>% 
  right_join(party_shares, by = "domain") %>% 
  arrange(-both)

p <- party_shares %>%
  head(n = 2 * 30) %>% 
  mutate(domain = factor(domain, levels = rev(unique(domain))),
    party = factor(party, levels = rev(unique(party)))) %>% 
  ggplot(aes(domain, m, fill = party)) +
  geom_col(alpha = 0.65) +
  coord_flip() +
  scale_fill_grey() +
  theme_classic()


### EXPERIMENT END














## Source Shares

subject_weights <- news %>% select(caseid, weight) %>% distinct()

user_visits <- news %>% group_by(caseid, party, domain) %>%
  summarise(num_visits = n()) %>% 
  ungroup()

user_visits <- user_visits %>% group_by(caseid, party) %>%
  summarise(total = sum(num_visits)) %>% 
  left_join(user_visits, ., by = c("caseid", "party")) %>% 
  mutate(share = num_visits / total)

user_visits <- user_visits %>% select(-total, -num_visits) %>% 
  complete(nesting(caseid, party), domain, fill = list(share = 0))


party_shares <- user_visits %>%
  left_join(subject_weights, by = "caseid") %>% 
  group_by(domain, party) %>%
  do(mod = lm(share ~ 1, weights = weight, data = .)) %>% 
  mutate(
    share = summary(mod)$coefficients[1],
    share_se = summary(mod)$coefficients[2]) %>% 
  select(-mod) %>% 
  arrange(-share) %>% 
  ungroup()



dem_shares <- party_shares %>% filter(party == "dem") %>%
  select(-party) %>%
  mutate(cumulative = cumsum(share)) %>% 
  mutate_at(-1, ~ round(. * 100, 3)) %>%
  head(n = 30) %>%
  as.data.frame()
colnames(dem_shares) <- c("Domain", "Average Share (%)",
  "Std. Error (%)", "Cumulative Share (%)")
stargazer(dem_shares,
  digits = 2,
  summary = FALSE, rownames = FALSE,
  style = "ajps", type = "latex",
  title = "Democratic Domain Shares (Weighted)",
  out = "../results/tables/source_share_dem.tex",
  table.placement = "H")

rep_shares <- party_shares %>% filter(party == "rep") %>%
  select(-party) %>%
  mutate(cumulative = cumsum(share)) %>% 
  mutate_at(-1, ~ round(. * 100, 3)) %>%
  head(n = 30) %>%
  as.data.frame()
colnames(rep_shares) <- c("Domain", "Average Share (%)",
  "Std. Error (%)", "Cumulative Share (%)")
stargazer(rep_shares,
  digits = 2,
  summary = FALSE, rownames = FALSE,
  style = "ajps", type = "latex",
  title = "Republican Domain Shares (Weighted)",
  out = "../results/tables/source_share_rep.tex",
  table.placement = "H")





plot_df <- party_shares %>% select(-share_se) %>%
  spread(party, share) %>%
  mutate(both = dem + rep) %>% arrange(-both) %>%
  mutate(rep_fav = (rep - dem)/both)




##### DOMAIN CATEGORIES



legacy_list <- c("cable television", "foreign newspaper", "foreign television", "local newspaper", "local radio", "local television", "magazine",
  "national newspaper", "national television", "radio", "student newspaper",
  "wire service")
online_list <- c("foreign online", "online", "online newspaper")
polling_list <- c("fivethirtyeight", "realclearpolitics", "rasmussenreports")

plot_df <- plot_df %>% left_join(alexa355 %>% distinct(), by = "domain")

plot_df <- plot_df %>% mutate(
  domain_cat = case_when(
    type %in% legacy_list ~ "legacy",
    domain %in% polling_list ~ "polling",
    type %in% online_list ~ "online",
    type == "portal" ~ "portal",
    TRUE ~ "OTHER"
    ))

#####



plot_df <- plot_df %>%
  mutate(domain_cat = factor(domain_cat, levels = unique(domain_cat))) %>% 
  rename(rel_diff = rep_fav)


a <- plot_df %>% arrange(-dem) %>%
  head(n = 30) %>% 
  mutate(domain = factor(domain, levels = rev(domain))) %>%
  ggplot(aes(domain, rel_diff, shape = domain_cat, size = dem)) +
  geom_point() +
  coord_flip() +
  geom_hline(yintercept = 0, lty = 3) +
  lims(y = c(-1, 1)) +
  labs(x = "Domain", y = "Favored by Democrats (-1) vs. Republicans (+1)") +
  ggtitle("Top 30 Democratic Political News Diets (Weighted)") +
  guides(size = FALSE) +
  theme(legend.title = element_blank(), legend.position = "bottom")
ggsave("../results/figures/market_share_dem.pdf", device = "pdf",
  plot = a, height = 6, width = 6)


p <- plot_df %>% arrange(-dem) %>%
  mutate(domain = factor(domain, levels = rev(domain))) %>%
  ggplot(aes(rel_diff, dem)) +
  geom_point() +
  geom_smooth()
  


b <- plot_df %>% arrange(-rep) %>%
  head(n = 30) %>% 
  mutate(domain = factor(domain, levels = rev(domain))) %>%
  ggplot(aes(domain, rel_diff, shape = domain_cat, size = rep)) +
  geom_point() +
  coord_flip() +
  geom_hline(yintercept = 0, lty = 3) +
  lims(y = c(-1, 1)) +
  labs(x = "Domain", y = "Favored by Democrats (-1) vs. Republicans (+1)") +
  ggtitle("Top 30 Republican Political News Diets (Weighted)") +
  guides(size = FALSE) +
  theme(legend.title = element_blank(), legend.position = "bottom")
ggsave("../results/figures/market_share_rep.pdf", device = "pdf",
  plot = b, height = 6, width = 6)


b <- plot_df %>% arrange(-rep) %>%
  head(n = 30) %>% 
  mutate(domain = factor(domain, levels = rev(domain))) %>%
  ggplot(aes(domain, rel_diff, shape = domain_cat, size = rep)) +
  geom_point() +
  coord_flip() +
  geom_hline(yintercept = 0, lty = 3) +
  lims(y = c(-1, 1)) +
  labs(x = "", y = "Favored by Democrats (-1) vs. Republicans (+1)") +
  ggtitle("Top 30 Republican Political News Diets (Weighted)") +
  guides(size = FALSE) +
  theme(legend.title = element_blank(), legend.position = "bottom")

library(ggpubr)
# ggarrange(a, b, ncol = 2)
# ggsave("../results/figures/market_share_both.pdf", device = "pdf", height = 6, width = 12)

p <- ggarrange(a, b, ncol = 2) %>% 
ggexport(p,
  filename = "../results/figures/market_share_both.pdf",
  height = 6, width = 12)





### Rep favorability vs. bakshy align




b_news <- news %>% select(domain, b_align) %>%
  group_by(domain) %>% 
  summarise(b_align = mean(b_align, na.rm = TRUE))

weighted_cor <- plot_df %>% select(domain, both, rel_diff) %>%
  left_join(b_news, by = "domain") %>% 
  na.omit() %>% 
  with(lm(b_align ~ rel_diff, weights = both))
weighted_cor <- sqrt(summary(weighted_cor)$r.squared) %>% round(2)

p <- plot_df %>% 
  left_join(b_news, by = "domain") %>% 
  ggplot(aes(rel_diff, b_align, size = both, weight = both)) +
  geom_point(alpha = 0.7) + geom_smooth(se = FALSE) +
  labs(
    title = "Share-Based Measure of Source Ideology vs. REP Favorability",
    subtitle = paste0("Traffic-Weighted Correlation = ", weighted_cor),
    x = "REP Favorability",
    y = "Bakshy et al. (2015) Alignment Score") +
  geom_abline(slope = 1, intercept = 0, lty = 2) +
  guides(size = FALSE)
ggsave("../results/figures/bakshy_vs_rep_fav.pdf", device = "pdf", height = 6, width = 6)






## Subject-level analysis

plot_df <- plot_df %>% rename(rep_fav = rel_diff)

subjects <- news %>% 
  left_join(plot_df, by = "domain") %>% 
  group_by_at(vars(caseid, pid:weight)) %>%
  summarise(
    num_caseid_visits = n(),
    avg_b_align = mean(b_align, na.rm = TRUE),
    avg_b_align_noportal = mean(b_align[portal == 0], na.rm = TRUE),
    avg_rep_fav = mean(rep_fav, na.rm = TRUE),
    avg_rep_fav_noportal = mean(rep_fav[portal == 0], na.rm = TRUE),
    portal_frac = mean(portal),
    LPU = portal_frac < 0.5) %>%
  ungroup()



p <- subjects %>%
  mutate(Party = ifelse(party == "dem", "DEM", "REP")) %>% 
  ggplot(aes(portal_frac, pid))



p <- subjects %>%
  mutate(Party = ifelse(party == "dem", "DEM", "REP")) %>% 
  ggplot(aes(portal_frac, avg_rep_fav, weight = weight, shape = Party,
    lty = Party)) +
  geom_smooth(color = "black") +
  geom_rug(alpha = 0.5, sides = "b")+
  coord_cartesian(ylim = c(-0.5, 0.5)) +
  geom_hline(yintercept = 0, lty = 3) +
  labs(x = "Fraction of Political News URLs from Portals",
    y = "Average Individual REP Favorability") +
  ggtitle("Source Polarization and Portal Share (Weighted)")
ggsave("../results/figures/polarization_vs_portal.pdf", device = "pdf", height = 4, width = 7)


p <- subjects %>%
  mutate(Party = ifelse(party == "dem", "DEM", "REP")) %>% 
  ggplot(aes(portal_frac, avg_rep_fav_noportal, weight = weight, shape = Party,
    lty = Party)) +
  geom_smooth(color = "black") +
  geom_rug(alpha = 0.5, sides = "b")+
  coord_cartesian(ylim = c(-0.5, 0.5)) +
  geom_hline(yintercept = 0, lty = 3) +
  labs(x = "Fraction of Political News URLs from Portals",
    y = "Average Individual Non-Portal REP Favorability") +
  ggtitle("Non-Portal Source Polarization and Portal Share (Weighted)")
ggsave("../results/figures/nonportal_polarization_vs_portal.pdf", device = "pdf", height = 4, width = 7)



## Portal Share
p <- subjects %>%
  mutate(party = case_when(party == "rep" ~ "REP", party == "dem" ~ "DEM")) %>% 
  ggplot(aes(portal_frac, weight = weight)) +
  geom_density(fill = "gray35", color = "white") +
  labs(x = "Fraction of Political News URLs from Portals",
    y = "Density") +
  facet_wrap(~ party) +
  theme(axis.text.y=element_blank(),
        axis.ticks.y=element_blank()) +
  ggtitle("Distribution of Portal Share (Weighted)")
ggsave("../results/figures/portal_frac.pdf", device = "pdf", height = 4, width = 6)


p <- subjects %>% ggplot(aes(portal_frac, num_caseid_visits, weight = weight)) +
  geom_smooth(col = "black") +
  labs(x = "Fraction of Political News URLs from Portals",
    y = "Number of Political News URL Visits") +
  ggtitle("News Consumption and Portal Share (Weighted)")
ggsave("../results/figures/consumption_vs_portal.pdf", device = "pdf", height = 4, width = 6)

subjects %>% group_by(party) %>% summarise(m = weightedMedian(portal_frac, weight))


survey <- read_sav("../data/November Survey/STAN0089_OUTPUT_all.sav")

survey <- survey %>%
  mutate(
    internet_primary = case_when(
    main_news_source_2_pre == 1 ~ 1,
    main_news_source_3_pre == 1 ~ 1,
    main_news_source_4_pre == 1 ~ 1,
    main_news_source_5_pre == 1 ~ 1,
    main_news_source_6_pre == 1 ~ 1,
    main_news_source_7_pre == 1 ~ 1,
    TRUE ~ 0)
    )

p <- survey %>% select(caseid, internet_primary) %>%
  mutate(caseid = as.integer(caseid)) %>% 
  left_join(subjects, .) %>% 
  ggplot(aes(portal_frac, internet_primary, weight = weight)) +
  geom_smooth(col = "black") +
  labs(x = "Fraction of Political News URLs from Portals",
    y = "Internet Is Main News Source") +
  ggtitle("Internet News Consumption and Portal Share (Weighted)")
ggsave("../results/figures/internet_vs_portal.pdf", device = "pdf", height = 4, width = 6)